From 33ea028eb0fde1c4c3d3846773617e2c948354ff Mon Sep 17 00:00:00 2001 From: "mafetter@fleming.research" Date: Mon, 9 May 2005 13:22:13 +0000 Subject: [PATCH] bitkeeper revision 1.1385.1.7 (427f6405sUeICnIzUJ_HaXbYnLds4A) Enabling light-weight shadows (especially shadow_mode_dirty). Light-weight shadows leave all the page ref counts based on the guest p.t. pages, while heavy-weight shadows do all their ref counts based on the shadow's p.t. pages. shadow_mode_refcounts(dom) == 1 implies heavy-weight shadows. --- xen/arch/x86/audit.c | 171 ++++--- xen/arch/x86/domain.c | 21 +- xen/arch/x86/domain_build.c | 4 +- xen/arch/x86/mm.c | 338 ++++++------- xen/arch/x86/shadow.c | 617 +++++++++++++++++------ xen/arch/x86/traps.c | 5 +- xen/arch/x86/vmx.c | 4 +- xen/include/asm-x86/mm.h | 27 +- xen/include/asm-x86/page.h | 2 + xen/include/asm-x86/shadow.h | 398 ++++++++++----- xen/include/asm-x86/x86_32/domain_page.h | 47 ++ xen/include/xen/lib.h | 2 +- xen/include/xen/perfc_defn.h | 6 +- 13 files changed, 1092 insertions(+), 550 deletions(-) diff --git a/xen/arch/x86/audit.c b/xen/arch/x86/audit.c index 2df4d69be8..b7e874c62d 100644 --- a/xen/arch/x86/audit.c +++ b/xen/arch/x86/audit.c @@ -49,7 +49,8 @@ static int l1, l2, oos_count, page_count; int audit_adjust_pgtables(struct domain *d, int dir, int noisy) { int errors = 0; - int shadow_enabled = shadow_mode_enabled(d) ? 1 : 0; + int shadow_refcounts = !!shadow_mode_refcounts(d); + int shadow_enabled = !!shadow_mode_enabled(d); int l2limit; void _adjust(struct pfn_info *page, int adjtype ADJUST_EXTRA_ARGS) @@ -119,7 +120,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) page->count_info += dir; } - void adjust_l2_page(unsigned long mfn) + void adjust_l2_page(unsigned long mfn, int shadow) { unsigned long *pt = map_domain_mem(mfn << PAGE_SHIFT); int i; @@ -133,7 +134,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) if ( noisy ) { - if ( shadow_enabled ) + if ( shadow ) { if ( page_get_owner(l1page) != NULL ) { @@ -145,6 +146,17 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) errors++; continue; } + + u32 page_type = l1page->u.inuse.type_info & PGT_type_mask; + + if ( page_type != PGT_l1_shadow ) + { + printk("Audit %d: [Shadow L2 mfn=%lx i=%x] " + "Expected Shadow L1 t=%x mfn=%lx\n", + d->id, mfn, i, + l1page->u.inuse.type_info, l1mfn); + errors++; + } } else { @@ -154,7 +166,9 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) "belonging to other dom %p (id=%d)\n", l1mfn, page_get_owner(l1page), - page_get_owner(l1page)->id); + (page_get_owner(l1page) + ? page_get_owner(l1page)->id + : -1)); errors++; continue; } @@ -179,7 +193,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) } } - adjust(l1page, !shadow_enabled); + adjust(l1page, !shadow); } } @@ -280,7 +294,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) errors++; } - if ( shadow_enabled && + if ( shadow_refcounts && page_is_page_table(gpage) && ! page_out_of_sync(gpage) ) { @@ -336,19 +350,21 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) break; case PGT_l1_shadow: adjust(pfn_to_page(gmfn), 0); - adjust_l1_page(smfn); + if ( shadow_refcounts ) + adjust_l1_page(smfn); if ( page->u.inuse.type_info & PGT_pinned ) adjust(page, 0); break; case PGT_hl2_shadow: adjust(pfn_to_page(gmfn), 0); - adjust_hl2_page(smfn); + if ( shadow_refcounts ) + adjust_hl2_page(smfn); if ( page->u.inuse.type_info & PGT_pinned ) adjust(page, 0); break; case PGT_l2_shadow: adjust(pfn_to_page(gmfn), 0); - adjust_l2_page(smfn); + adjust_l2_page(smfn, 1); if ( page->u.inuse.type_info & PGT_pinned ) adjust(page, 0); break; @@ -391,45 +407,43 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) struct exec_domain *ed; for_each_exec_domain(d, ed) - { - if ( !shadow_enabled ) - { - if ( pagetable_val(ed->arch.guest_table) ) - adjust(&frame_table[pagetable_val(ed->arch.guest_table) - >> PAGE_SHIFT], 1); - } - else - { - if ( pagetable_val(ed->arch.guest_table) ) - adjust(&frame_table[pagetable_val(ed->arch.guest_table) - >> PAGE_SHIFT], 0); - if ( pagetable_val(ed->arch.shadow_table) ) - adjust(&frame_table[pagetable_val(ed->arch.shadow_table) - >> PAGE_SHIFT], 0); - if ( ed->arch.monitor_shadow_ref ) - adjust(&frame_table[ed->arch.monitor_shadow_ref], 0); - } - } + { + if ( pagetable_val(ed->arch.guest_table) ) + adjust(&frame_table[pagetable_get_pfn(ed->arch.guest_table)], 1); + if ( pagetable_val(ed->arch.shadow_table) ) + adjust(&frame_table[pagetable_get_pfn(ed->arch.shadow_table)], 0); + if ( ed->arch.monitor_shadow_ref ) + adjust(&frame_table[ed->arch.monitor_shadow_ref], 0); + } } void adjust_guest_pages() { struct list_head *list_ent = d->page_list.next; struct pfn_info *page; - unsigned long mfn; + unsigned long mfn, snapshot_mfn; while ( list_ent != &d->page_list ) { u32 page_type; page = list_entry(list_ent, struct pfn_info, list); - mfn = page_to_pfn(page); + snapshot_mfn = mfn = page_to_pfn(page); page_type = page->u.inuse.type_info & PGT_type_mask; BUG_ON(page_get_owner(page) != d); page_count++; + if ( shadow_enabled && !shadow_refcounts && + page_out_of_sync(page) ) + { + unsigned long gpfn = __mfn_to_gpfn(d, mfn); + ASSERT( VALID_M2P(gpfn) ); + snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot); + ASSERT( snapshot_mfn ); + } + switch ( page_type ) { case PGT_l2_page_table: @@ -437,7 +451,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) if ( noisy ) { - if ( shadow_enabled ) + if ( shadow_refcounts ) { printk("Audit %d: found an L2 guest page " "mfn=%lx t=%08x c=%08x while in shadow mode\n", @@ -446,19 +460,22 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) errors++; } - if ( (page->u.inuse.type_info & PGT_validated) != - PGT_validated ) + if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) { - printk("Audit %d: L2 mfn=%lx not validated %08x\n", - d->id, mfn, page->u.inuse.type_info); - errors++; - } + if ( (page->u.inuse.type_info & PGT_validated) != + PGT_validated ) + { + printk("Audit %d: L2 mfn=%lx not validated %08x\n", + d->id, mfn, page->u.inuse.type_info); + errors++; + } - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) - { - printk("Audit %d: L2 mfn=%lx not pinned t=%08x\n", - d->id, mfn, page->u.inuse.type_info); - errors++; + if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) + { + printk("Audit %d: L2 mfn=%lx not pinned t=%08x\n", + d->id, mfn, page->u.inuse.type_info); + errors++; + } } } @@ -466,7 +483,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) adjust(page, 1); if ( page->u.inuse.type_info & PGT_validated ) - adjust_l2_page(mfn); + adjust_l2_page(snapshot_mfn, 0); break; @@ -475,7 +492,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) if ( noisy ) { - if ( shadow_enabled ) + if ( shadow_refcounts ) { printk("found an L1 guest page mfn=%lx t=%08x c=%08x " "while in shadow mode\n", @@ -483,21 +500,24 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) errors++; } - if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) - { - printk("Audit %d: L1 not validated mfn=%lx t=%08x\n", - d->id, mfn, page->u.inuse.type_info); - errors++; - } - - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) + if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) { - if ( !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) ) + if ( (page->u.inuse.type_info & PGT_validated) != + PGT_validated ) { - printk("Audit %d: L1 mfn=%lx not pinned t=%08x\n", + printk("Audit %d: L1 not validated mfn=%lx t=%08x\n", d->id, mfn, page->u.inuse.type_info); errors++; } + + if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) + { + if ( !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) ) + { + printk("Audit %d: L1 mfn=%lx not pinned t=%08x\n", + d->id, mfn, page->u.inuse.type_info); + } + } } } @@ -505,7 +525,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) adjust(page, 1); if ( page->u.inuse.type_info & PGT_validated ) - adjust_l1_page(mfn); + adjust_l1_page(snapshot_mfn); break; @@ -520,7 +540,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) break; case PGT_writable_page: - if ( shadow_enabled ) + if ( shadow_refcounts ) { // In shadow mode, writable pages can get pinned by // paravirtualized guests that think they are pinning @@ -589,6 +609,8 @@ void audit_pagelist(struct domain *d) void _audit_domain(struct domain *d, int flags) { + int shadow_refcounts = !!shadow_mode_refcounts(d); + void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn, unsigned long mfn) { @@ -608,8 +630,29 @@ void _audit_domain(struct domain *d, int flags) unmap_domain_mem(pt); } + void scan_for_pfn_in_grant_table(struct domain *d, unsigned xmfn) + { + int i; + active_grant_entry_t *act = d->grant_table->active; + + spin_lock(&d->grant_table->lock); + + for ( i = 0; i < NR_GRANT_ENTRIES; i++ ) + { + if ( act[i].pin && (act[i].frame == xmfn) ) + { + printk(" found active grant table entry i=%d dom=%d pin=%d\n", + i, act[i].domid, act[i].pin); + } + } + + spin_unlock(&d->grant_table->lock); + } + void scan_for_pfn(struct domain *d, unsigned long xmfn) { + scan_for_pfn_in_grant_table(d, xmfn); + if ( !shadow_mode_enabled(d) ) { struct list_head *list_ent = d->page_list.next; @@ -688,7 +731,7 @@ void _audit_domain(struct domain *d, int flags) // Maybe we should just be using BIGLOCK? // - if ( !(flags & AUDIT_ALREADY_LOCKED) ) + if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) ) shadow_lock(d); spin_lock(&d->page_alloc_lock); @@ -716,7 +759,7 @@ void _audit_domain(struct domain *d, int flags) errors++; } - if ( shadow_mode_enabled(d) && + if ( shadow_mode_refcounts(d) && (page_type == PGT_writable_page) && !(page->u.inuse.type_info & PGT_validated) ) { @@ -764,7 +807,9 @@ void _audit_domain(struct domain *d, int flags) mfn); errors++; } - if ( page_type != PGT_writable_page ) + if ( shadow_refcounts + ? (page_type != PGT_writable_page) + : !(page_type && (page_type <= PGT_l4_page_table)) ) { printk("out of sync page mfn=%lx has strange type " "t=%08x c=%08x\n", @@ -821,7 +866,7 @@ void _audit_domain(struct domain *d, int flags) d->id, page->u.inuse.type_info, page->tlbflush_timestamp, page->count_info, mfn); - errors++; + //errors++; } break; default: @@ -835,7 +880,7 @@ void _audit_domain(struct domain *d, int flags) page->count_info, page->u.inuse.type_info, page->tlbflush_timestamp, mfn ); - errors++; + //errors++; scan_for_pfn_remote(mfn); } @@ -870,6 +915,8 @@ void _audit_domain(struct domain *d, int flags) d->id, page_to_pfn(page), page->u.inuse.type_info, page->count_info); + printk("a->gpfn_and_flags=%p\n", + (void *)a->gpfn_and_flags); errors++; } break; @@ -905,7 +952,7 @@ void _audit_domain(struct domain *d, int flags) "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, page_count, oos_count, l1, l2, ctot, ttot); - if ( !(flags & AUDIT_ALREADY_LOCKED) ) + if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) ) shadow_unlock(d); if ( d != current->domain ) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index a5029a45d5..2003ecc6ed 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -359,7 +359,8 @@ static int vmx_final_setup_guest(struct exec_domain *ed, /* Put the domain in shadow mode even though we're going to be using * the shared 1:1 page table initially. It shouldn't hurt */ - shadow_mode_enable(ed->domain, SHM_enable|SHM_translate|SHM_external); + shadow_mode_enable(ed->domain, + SHM_enable|SHM_refcounts|SHM_translate|SHM_external); } return 0; @@ -450,7 +451,7 @@ int arch_set_info_guest( phys_basetab = c->pt_base; ed->arch.guest_table = mk_pagetable(phys_basetab); - if ( shadow_mode_enabled(d) ) + if ( shadow_mode_refcounts(d) ) { if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) ) return -EINVAL; @@ -991,17 +992,21 @@ void domain_relinquish_resources(struct domain *d) { if ( pagetable_val(ed->arch.guest_table) != 0 ) { - (shadow_mode_enabled(d) ? put_page : put_page_and_type) - (&frame_table[pagetable_val( - ed->arch.guest_table) >> PAGE_SHIFT]); + if ( shadow_mode_refcounts(d) ) + put_page(&frame_table[pagetable_get_pfn(ed->arch.guest_table)]); + else + put_page_and_type(&frame_table[pagetable_get_pfn(ed->arch.guest_table)]); + ed->arch.guest_table = mk_pagetable(0); } if ( pagetable_val(ed->arch.guest_table_user) != 0 ) { - (shadow_mode_enabled(d) ? put_page : put_page_and_type) - (&frame_table[pagetable_val( - ed->arch.guest_table_user) >> PAGE_SHIFT]); + if ( shadow_mode_refcounts(d) ) + put_page(&frame_table[pagetable_get_pfn(ed->arch.guest_table_user)]); + else + put_page_and_type(&frame_table[pagetable_get_pfn(ed->arch.guest_table_user)]); + ed->arch.guest_table_user = mk_pagetable(0); } diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index f1488d6f08..570fd1d33c 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -546,7 +546,7 @@ int construct_dom0(struct domain *d, if ( opt_dom0_shadow || opt_dom0_translate ) { shadow_mode_enable(d, (opt_dom0_translate - ? SHM_enable | SHM_translate + ? SHM_enable | SHM_refcounts | SHM_translate : SHM_enable)); if ( opt_dom0_translate ) { @@ -569,7 +569,7 @@ int construct_dom0(struct domain *d, idle_pg_table[1] = root_create_phys(pagetable_val(d->arch.phys_table), __PAGE_HYPERVISOR); translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT), - pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT); + pagetable_get_pfn(ed->arch.guest_table)); idle_pg_table[1] = root_empty(); local_flush_tlb(); } diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 050a4c0d33..107d2af585 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -316,7 +316,7 @@ int map_ldt_shadow_page(unsigned int off) res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page); - if ( !res && unlikely(shadow_mode_enabled(d)) ) + if ( !res && unlikely(shadow_mode_refcounts(d)) ) { shadow_lock(d); shadow_remove_all_write_access(d, gpfn, gmfn); @@ -392,7 +392,7 @@ get_linear_pagetable( struct pfn_info *page; unsigned long pfn; - ASSERT( !shadow_mode_enabled(d) ); + ASSERT( !shadow_mode_refcounts(d) ); if ( (root_get_flags(re) & _PAGE_RW) ) { @@ -482,7 +482,7 @@ get_page_from_l2e( { int rc; - ASSERT(!shadow_mode_enabled(d)); + ASSERT(!shadow_mode_refcounts(d)); if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) return 1; @@ -512,6 +512,8 @@ static int get_page_from_l3e( l3_pgentry_t l3e, unsigned long pfn, struct domain *d) { + ASSERT( !shadow_mode_refcounts(d) ); + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) return 1; @@ -533,6 +535,8 @@ get_page_from_l4e( { int rc; + ASSERT( !shadow_mode_refcounts(d) ); + if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return 1; @@ -641,7 +645,7 @@ static int alloc_l1_table(struct pfn_info *page) l1_pgentry_t *pl1e; int i; - ASSERT(!shadow_mode_enabled(d)); + ASSERT(!shadow_mode_refcounts(d)); pl1e = map_domain_mem(pfn << PAGE_SHIFT); @@ -670,10 +674,12 @@ static int alloc_l2_table(struct pfn_info *page) l2_pgentry_t *pl2e; int i; + // See the code in shadow_promote() to understand why this is here... if ( (PGT_base_page_table == PGT_l2_page_table) && - shadow_mode_enabled(d) ) + unlikely(shadow_mode_refcounts(d)) ) return 1; - ASSERT( !shadow_mode_enabled(d) ); + + ASSERT( !shadow_mode_refcounts(d) ); pl2e = map_domain_mem(pfn << PAGE_SHIFT); @@ -716,7 +722,7 @@ static int alloc_l3_table(struct pfn_info *page) l3_pgentry_t *pl3e = page_to_virt(page); int i; - ASSERT( !shadow_mode_enabled(d) ); + ASSERT( !shadow_mode_refcounts(d) ); for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) if ( is_guest_l3_slot(i) && @@ -741,10 +747,12 @@ static int alloc_l4_table(struct pfn_info *page) l4_pgentry_t *pl4e = page_to_virt(page); int i; + // See the code in shadow_promote() to understand why this is here... if ( (PGT_base_page_table == PGT_l4_page_table) && - shadow_mode_enabled(d) ) + shadow_mode_refcounts(d) ) return 1; - ASSERT( !shadow_mode_enabled(d) ); + + ASSERT( !shadow_mode_refcounts(d) ); for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) if ( is_guest_l4_slot(i) && @@ -861,11 +869,12 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) l1_pgentry_t ol1e; struct domain *d = current->domain; - ASSERT( !shadow_mode_enabled(d) ); - if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ) return 0; + if ( unlikely(shadow_mode_refcounts(d)) ) + return update_l1e(pl1e, ol1e, nl1e); + if ( l1e_get_flags(nl1e) & _PAGE_PRESENT ) { if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) ) @@ -893,7 +902,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) return 0; } - + put_page_from_l1e(ol1e, d); return 1; } @@ -1095,8 +1104,19 @@ int alloc_page_type(struct pfn_info *page, unsigned int type) void free_page_type(struct pfn_info *page, unsigned int type) { struct domain *owner = page_get_owner(page); - if ( likely(owner != NULL) && unlikely(shadow_mode_enabled(owner)) ) - return; + unsigned long gpfn; + + if ( owner != NULL ) + { + if ( unlikely(shadow_mode_refcounts(owner)) ) + return; + if ( unlikely(shadow_mode_enabled(owner)) ) + { + gpfn = __mfn_to_gpfn(owner, page_to_pfn(page)); + ASSERT(VALID_M2P(gpfn)); + remove_shadow(owner, gpfn, type); + } + } switch ( type ) { @@ -1287,7 +1307,7 @@ int new_guest_cr3(unsigned long mfn) int okay; unsigned long old_base_mfn; - if ( shadow_mode_enabled(d) ) + if ( shadow_mode_refcounts(d) ) okay = get_page_from_pagenr(mfn, d); else okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d); @@ -1296,24 +1316,24 @@ int new_guest_cr3(unsigned long mfn) { invalidate_shadow_ldt(ed); - old_base_mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; + old_base_mfn = pagetable_get_pfn(ed->arch.guest_table); ed->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT); update_pagetables(ed); /* update shadow_table and monitor_table */ write_ptbase(ed); - if ( shadow_mode_enabled(d) ) + if ( shadow_mode_refcounts(d) ) put_page(&frame_table[old_base_mfn]); else put_page_and_type(&frame_table[old_base_mfn]); - /* CR3 holds its own ref to its shadow. */ + /* CR3 also holds a ref to its shadow... */ if ( shadow_mode_enabled(d) ) { if ( ed->arch.monitor_shadow_ref ) put_shadow_ref(ed->arch.monitor_shadow_ref); ed->arch.monitor_shadow_ref = - pagetable_val(ed->arch.monitor_table) >> PAGE_SHIFT; + pagetable_get_pfn(ed->arch.monitor_table); ASSERT(!page_get_owner(&frame_table[ed->arch.monitor_shadow_ref])); get_shadow_ref(ed->arch.monitor_shadow_ref); } @@ -1486,7 +1506,7 @@ int do_mmuext_op( type = PGT_l1_page_table | PGT_va_mutable; pin_page: - if ( shadow_mode_enabled(FOREIGNDOM) ) + if ( shadow_mode_refcounts(FOREIGNDOM) ) type = PGT_writable_page; okay = get_page_and_type_from_pagenr(op.mfn, type, FOREIGNDOM); @@ -1557,7 +1577,7 @@ int do_mmuext_op( else { unsigned long old_mfn = - pagetable_val(ed->arch.guest_table_user) >> PAGE_SHIFT; + pagetable_get_pfn(ed->arch.guest_table_user); ed->arch.guest_table_user = mk_pagetable(op.mfn << PAGE_SHIFT); if ( old_mfn != 0 ) put_page_and_type(&frame_table[old_mfn]); @@ -1785,13 +1805,16 @@ int do_mmu_update( unsigned int foreigndom) { mmu_update_t req; - unsigned long va = 0, mfn, prev_mfn = 0, gpfn; + void *va; + unsigned long gpfn, mfn; struct pfn_info *page; int rc = 0, okay = 1, i = 0, cpu = smp_processor_id(); unsigned int cmd, done = 0; struct exec_domain *ed = current; struct domain *d = ed->domain; u32 type_info; + struct map_dom_mem_cache mapcache = MAP_DOM_MEM_CACHE_INIT; + struct map_dom_mem_cache sh_mapcache = MAP_DOM_MEM_CACHE_INIT; LOCK_BIGLOCK(d); @@ -1841,8 +1864,6 @@ int do_mmu_update( } cmd = req.ptr & (sizeof(l1_pgentry_t)-1); - mfn = req.ptr >> PAGE_SHIFT; - okay = 0; switch ( cmd ) @@ -1851,73 +1872,75 @@ int do_mmu_update( * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. */ case MMU_NORMAL_PT_UPDATE: + + gpfn = req.ptr >> PAGE_SHIFT; + mfn = __gpfn_to_mfn(d, gpfn); + if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) ) { MEM_LOG("Could not get page for normal update"); break; } - if ( likely(prev_mfn == mfn) ) - { - va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); - } - else - { - if ( prev_mfn != 0 ) - unmap_domain_mem((void *)va); - va = (unsigned long)map_domain_mem(req.ptr); - prev_mfn = mfn; - } - + va = map_domain_mem_with_cache(req.ptr, &mapcache); page = &frame_table[mfn]; + switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) { case PGT_l1_page_table: - ASSERT(!shadow_mode_enabled(d)); + ASSERT( !shadow_mode_refcounts(d) ); if ( likely(get_page_type( page, type_info & (PGT_type_mask|PGT_va_mask))) ) { - l1_pgentry_t pte; + l1_pgentry_t l1e; /* FIXME: doesn't work with PAE */ - pte = l1e_create_phys(req.val, req.val); - okay = mod_l1_entry((l1_pgentry_t *)va, pte); + l1e = l1e_create_phys(req.val, req.val); + okay = mod_l1_entry(va, l1e); + if ( okay && unlikely(shadow_mode_enabled(d)) ) + shadow_l1_normal_pt_update(d, req.ptr, l1e, &sh_mapcache); put_page_type(page); } break; case PGT_l2_page_table: - ASSERT(!shadow_mode_enabled(d)); + ASSERT( !shadow_mode_refcounts(d) ); if ( likely(get_page_type(page, PGT_l2_page_table)) ) { l2_pgentry_t l2e; /* FIXME: doesn't work with PAE */ l2e = l2e_create_phys(req.val, req.val); - okay = mod_l2_entry((l2_pgentry_t *)va, l2e, mfn); + okay = mod_l2_entry(va, l2e, mfn); + if ( okay && unlikely(shadow_mode_enabled(d)) ) + shadow_l2_normal_pt_update(d, req.ptr, l2e, &sh_mapcache); put_page_type(page); } break; #ifdef __x86_64__ case PGT_l3_page_table: - ASSERT(!shadow_mode_enabled(d)); + ASSERT( !shadow_mode_refcounts(d) ); if ( likely(get_page_type(page, PGT_l3_page_table)) ) { l3_pgentry_t l3e; /* FIXME: doesn't work with PAE */ l3e = l3e_create_phys(req.val,req.val); - okay = mod_l3_entry((l3_pgentry_t *)va, l3e, mfn); + okay = mod_l3_entry(va, l3e, mfn); + if ( okay && unlikely(shadow_mode_enabled(d)) ) + shadow_l3_normal_pt_update(d, req.ptr, l3e, &sh_mapcache); put_page_type(page); } break; case PGT_l4_page_table: - ASSERT(!shadow_mode_enabled(d)); + ASSERT( !shadow_mode_refcounts(d) ); if ( likely(get_page_type(page, PGT_l4_page_table)) ) { l4_pgentry_t l4e; l4e = l4e_create_phys(req.val,req.val); - okay = mod_l4_entry((l4_pgentry_t *)va, l4e, mfn); + okay = mod_l4_entry(va, l4e, mfn); + if ( okay && unlikely(shadow_mode_enabled(d)) ) + shadow_l4_normal_pt_update(d, req.ptr, l4e, &sh_mapcache); put_page_type(page); } break; @@ -1932,9 +1955,6 @@ int do_mmu_update( if ( shadow_mode_log_dirty(d) ) __mark_dirty(d, mfn); - gpfn = __mfn_to_gpfn(d, mfn); - ASSERT(VALID_M2P(gpfn)); - if ( page_is_page_table(page) && !page_out_of_sync(page) ) { @@ -1953,24 +1973,29 @@ int do_mmu_update( break; } + unmap_domain_mem_with_cache(va, &mapcache); + put_page(page); break; case MMU_MACHPHYS_UPDATE: + mfn = req.ptr >> PAGE_SHIFT; + gpfn = req.val; + /* HACK ALERT... Need to think about this some more... */ if ( unlikely(shadow_mode_translate(FOREIGNDOM) && IS_PRIV(d)) ) { - rc = FOREIGNDOM->next_io_page++; - printk("privileged guest dom%d requests mfn=%lx for dom%d, " - "gets pfn=%x\n", - d->id, mfn, FOREIGNDOM->id, rc); - set_machinetophys(mfn, rc); - set_p2m_entry(FOREIGNDOM, rc, mfn); + shadow_lock(FOREIGNDOM); + printk("privileged guest dom%d requests pfn=%lx to map mfn=%lx for dom%d\n", + d->id, gpfn, mfn, FOREIGNDOM->id); + set_machinetophys(mfn, gpfn); + set_p2m_entry(FOREIGNDOM, gpfn, mfn, NULL, NULL); okay = 1; + shadow_unlock(FOREIGNDOM); break; } - + if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) ) { MEM_LOG("Could not get page for mach->phys update"); @@ -1983,7 +2008,7 @@ int do_mmu_update( break; } - set_machinetophys(mfn, req.val); + set_machinetophys(mfn, gpfn); okay = 1; /* @@ -2012,8 +2037,8 @@ int do_mmu_update( } out: - if ( prev_mfn != 0 ) - unmap_domain_mem((void *)va); + unmap_domain_mem_cache(&mapcache); + unmap_domain_mem_cache(&sh_mapcache); process_deferred_ops(cpu); @@ -2031,73 +2056,6 @@ int do_mmu_update( /* This function assumes the caller is holding the domain's BIGLOCK * and is running in a shadow mode */ -int update_shadow_va_mapping(unsigned long va, - l1_pgentry_t val, - struct exec_domain *ed, - struct domain *d) -{ - unsigned long l1mfn; - l1_pgentry_t spte; - int rc = 0; - - check_pagetable(ed, "pre-va"); /* debug */ - shadow_lock(d); - - // This is actually overkill - we don't need to sync the L1 itself, - // just everything involved in getting to this L1 (i.e. we need - // linear_pg_table[l1_linear_offset(va)] to be in sync)... - // - __shadow_sync_va(ed, va); - -#if 1 /* keep check_pagetables() happy */ - /* - * However, the above doesn't guarantee that there's no snapshot of - * the L1 table in question; it just says that the relevant L2 and L1 - * entries for VA are in-sync. There might still be a snapshot. - * - * The checking code in _check_pagetables() assumes that no one will - * mutate the shadow of a page that has a snapshot. It's actually - * OK to not sync this page, but it seems simpler to: - * 1) keep all code paths the same, and - * 2) maintain the invariant for _check_pagetables(), rather than try - * to teach it about this boundary case. - * So we flush this L1 page, if it's out of sync. - */ - l1mfn = l2e_get_pfn(linear_l2_table(ed)[l2_table_offset(va)]); - if ( mfn_out_of_sync(l1mfn) ) - { - perfc_incrc(extra_va_update_sync); - __shadow_sync_mfn(d, l1mfn); - } -#endif /* keep check_pagetables() happy */ - - if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)], - &val, sizeof(val)))) - { - rc = -EINVAL; - goto out; - } - - // also need to update the shadow - - l1pte_propagate_from_guest(d, val, &spte); - shadow_set_l1e(va, spte, 0); - - /* - * If we're in log-dirty mode then we need to note that we've updated - * the PTE in the PT-holding page. We need the machine frame number - * for this. - */ - if ( shadow_mode_log_dirty(d) ) - mark_dirty(d, va_to_l1mfn(ed, va)); - - out: - shadow_unlock(d); - check_pagetable(ed, "post-va"); /* debug */ - - return rc; -} - int update_grant_va_mapping(unsigned long va, l1_pgentry_t _nl1e, struct domain *d, @@ -2116,11 +2074,17 @@ int update_grant_va_mapping(unsigned long va, cleanup_writable_pagetable(d); + // This is actually overkill - we don't need to sync the L1 itself, + // just everything involved in getting to this L1 (i.e. we need + // linear_pg_table[l1_linear_offset(va)] to be in sync)... + // + __shadow_sync_va(ed, va); + pl1e = &linear_pg_table[l1_linear_offset(va)]; if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ) rc = -EINVAL; - else + else if ( !shadow_mode_refcounts(d) ) { if ( update_l1e(pl1e, ol1e, _nl1e) ) { @@ -2133,9 +2097,14 @@ int update_grant_va_mapping(unsigned long va, else rc = -EINVAL; } + else + { + printk("grant tables and shadow mode currently don't work together\n"); + BUG(); + } if ( unlikely(shadow_mode_enabled(d)) ) - update_shadow_va_mapping(va, _nl1e, ed, d); + shadow_do_update_va_mapping(va, _nl1e, ed); return rc; } @@ -2161,6 +2130,13 @@ int do_update_va_mapping(unsigned long va, cleanup_writable_pagetable(d); if ( unlikely(shadow_mode_enabled(d)) ) + check_pagetable(ed, "pre-va"); /* debug */ + + if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)], + val)) ) + rc = -EINVAL; + + if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) ) { if ( unlikely(percpu_info[cpu].foreign && (shadow_mode_translate(d) || @@ -2173,11 +2149,10 @@ int do_update_va_mapping(unsigned long va, domain_crash(); } - rc = update_shadow_va_mapping(va, val, ed, d); + rc = shadow_do_update_va_mapping(va, val, ed); + + check_pagetable(ed, "post-va"); /* debug */ } - else if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)], - val)) ) - rc = -EINVAL; switch ( flags & UVMF_FLUSHTYPE_MASK ) { @@ -2468,14 +2443,68 @@ int ptwr_debug = 0x0; #define PTWR_PRINTK(_f, _a...) ((void)0) #endif +/* Re-validate a given p.t. page, given its prior snapshot */ +int revalidate_l1(struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot) +{ + l1_pgentry_t ol1e, nl1e; + int modified = 0, i; + +#if 0 + if ( d->id ) + printk("%s: l1page mfn=%lx snapshot mfn=%lx\n", __func__, + l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)l1page)]), + l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)snapshot)])); +#endif + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { + ol1e = snapshot[i]; + nl1e = l1page[i]; + + if ( likely(l1e_get_value(ol1e) == l1e_get_value(nl1e)) ) + continue; + + /* Update number of entries modified. */ + modified++; + + /* + * Fast path for PTEs that have merely been write-protected + * (e.g., during a Unix fork()). A strict reduction in privilege. + */ + if ( likely(l1e_get_value(ol1e) == (l1e_get_value(nl1e)|_PAGE_RW)) ) + { + if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) ) + put_page_type(&frame_table[l1e_get_pfn(nl1e)]); + continue; + } + + if ( unlikely(!get_page_from_l1e(nl1e, d)) ) + { + MEM_LOG("ptwr: Could not re-validate l1 page\n"); + /* + * Make the remaining p.t's consistent before crashing, so the + * reference counts are correct. + */ + memcpy(&l1page[i], &snapshot[i], + (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t)); + domain_crash(); + break; + } + + put_page_from_l1e(ol1e, d); + } + + return modified; +} + + /* Flush the given writable p.t. page and write-protect it again. */ void ptwr_flush(struct domain *d, const int which) { unsigned long pte, *ptep, l1va; - l1_pgentry_t *pl1e, ol1e, nl1e; + l1_pgentry_t *pl1e; l2_pgentry_t *pl2e; - int i; - unsigned int modified = 0; + unsigned int modified; ASSERT(!shadow_mode_enabled(d)); @@ -2524,45 +2553,8 @@ void ptwr_flush(struct domain *d, const int which) */ pl1e = d->arch.ptwr[which].pl1e; - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - { - ol1e = d->arch.ptwr[which].page[i]; - nl1e = pl1e[i]; - - if ( likely(l1e_get_value(ol1e) == l1e_get_value(nl1e)) ) - continue; - - /* Update number of entries modified. */ - modified++; - - /* - * Fast path for PTEs that have merely been write-protected - * (e.g., during a Unix fork()). A strict reduction in privilege. - */ - if ( likely(l1e_get_value(ol1e) == (l1e_get_value(nl1e)|_PAGE_RW)) ) - { - if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) ) - put_page_type(&frame_table[l1e_get_pfn(nl1e)]); - continue; - } - - if ( unlikely(!get_page_from_l1e(nl1e, d)) ) - { - MEM_LOG("ptwr: Could not re-validate l1 page\n"); - /* - * Make the remaining p.t's consistent before crashing, so the - * reference counts are correct. - */ - memcpy(&pl1e[i], &d->arch.ptwr[which].page[i], - (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t)); - domain_crash(); - break; - } - - put_page_from_l1e(ol1e, d); - } + modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page); unmap_domain_mem(pl1e); - perfc_incr_histo(wpt_updates, modified, PT_UPDATES); d->arch.ptwr[which].prev_nr_updates = modified; diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index 1db8ed71e9..817757c36a 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -30,11 +30,17 @@ #include #include +#define MFN_PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned) + static void shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry); static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn); static void free_writable_pte_predictions(struct domain *d); +#if SHADOW_DEBUG +static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn); +#endif + /******** There's a per-domain shadow table spin lock which works fine for SMP @@ -62,6 +68,9 @@ shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, __shadow_sync_mfn(d, gmfn); } + if ( !shadow_mode_refcounts(d) ) + return 1; + if ( unlikely(page_is_page_table(page)) ) return 1; @@ -89,7 +98,7 @@ shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, // TLB flushes required when promoting a writable page, and also deal // with any outstanding (external) writable refs to this page (by // refusing to promote it). The pinning headache complicates this - // code -- it would all much get simpler if we stop using + // code -- it would all get much simpler if we stop using // shadow_lock() and move the shadow code to BIGLOCK(). // if ( unlikely(!get_page(page, d)) ) @@ -130,6 +139,9 @@ shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, static inline void shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn) { + if ( !shadow_mode_refcounts(d) ) + return; + ASSERT(frame_table[gmfn].count_info & PGC_page_table); if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none ) @@ -210,7 +222,7 @@ alloc_shadow_page(struct domain *d, else { page = alloc_domheap_page(NULL); - void *l1 = map_domain_mem(page_to_pfn(page) << PAGE_SHIFT); + void *l1 = map_domain_mem(page_to_phys(page)); memset(l1, 0, PAGE_SIZE); unmap_domain_mem(l1); } @@ -312,7 +324,7 @@ free_shadow_l1_table(struct domain *d, unsigned long smfn) for ( i = min; i <= max; i++ ) { - put_page_from_l1e(pl1e[i], d); + shadow_put_page_from_l1e(pl1e[i], d); pl1e[i] = l1e_empty(); } @@ -348,21 +360,20 @@ free_shadow_hl2_table(struct domain *d, unsigned long smfn) static void inline free_shadow_l2_table(struct domain *d, unsigned long smfn) { - unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT); + l2_pgentry_t *pl2e = map_domain_mem(smfn << PAGE_SHIFT); int i, external = shadow_mode_external(d); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) if ( external || is_guest_l2_slot(i) ) - if ( pl2e[i] & _PAGE_PRESENT ) - put_shadow_ref(pl2e[i] >> PAGE_SHIFT); + if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT ) + put_shadow_ref(l2e_get_pfn(pl2e[i])); if ( (PGT_base_page_table == PGT_l2_page_table) && shadow_mode_translate(d) && !external ) { // free the ref to the hl2 // - put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] - >> PAGE_SHIFT); + put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)])); } unmap_domain_mem(pl2e); @@ -428,6 +439,26 @@ void free_shadow_page(unsigned long smfn) free_domheap_page(page); } +void +remove_shadow(struct domain *d, unsigned long gpfn, u32 stype) +{ + unsigned long smfn; + + //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype); + + shadow_lock(d); + + while ( stype >= PGT_l1_shadow ) + { + smfn = __shadow_status(d, gpfn, stype); + if ( smfn && MFN_PINNED(smfn) ) + shadow_unpin(smfn); + stype -= PGT_l1_shadow; + } + + shadow_unlock(d); +} + static void inline release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry) { @@ -537,15 +568,22 @@ static void free_shadow_pages(struct domain *d) // free_out_of_sync_state(d); - // second, remove any outstanding refs from ed->arch.shadow_table... + // second, remove any outstanding refs from ed->arch.shadow_table + // and CR3. // for_each_exec_domain(d, ed) { if ( pagetable_val(ed->arch.shadow_table) ) { - put_shadow_ref(pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT); + put_shadow_ref(pagetable_get_pfn(ed->arch.shadow_table)); ed->arch.shadow_table = mk_pagetable(0); } + + if ( ed->arch.monitor_shadow_ref ) + { + put_shadow_ref(ed->arch.monitor_shadow_ref); + ed->arch.monitor_shadow_ref = 0; + } } // For external shadows, remove the monitor table's refs @@ -584,7 +622,6 @@ static void free_shadow_pages(struct domain *d) // under us... First, collect the list of pinned pages, then // free them. // -#define PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned) for ( i = 0; i < shadow_ht_buckets; i++ ) { u32 count; @@ -596,7 +633,7 @@ static void free_shadow_pages(struct domain *d) count = 0; for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) - if ( PINNED(x->smfn) ) + if ( MFN_PINNED(x->smfn) ) count++; if ( !count ) continue; @@ -604,7 +641,7 @@ static void free_shadow_pages(struct domain *d) mfn_list = xmalloc_array(unsigned long, count); count = 0; for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) - if ( PINNED(x->smfn) ) + if ( MFN_PINNED(x->smfn) ) mfn_list[count++] = x->smfn; while ( count ) @@ -613,7 +650,18 @@ static void free_shadow_pages(struct domain *d) } xfree(mfn_list); } -#undef PINNED + + // Now free the pre-zero'ed pages from the domain + // + struct list_head *list_ent, *tmp; + list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames) + { + list_del(list_ent); + perfc_decr(free_l1_pages); + + struct pfn_info *page = list_entry(list_ent, struct pfn_info, list); + free_domheap_page(page); + } shadow_audit(d, 0); @@ -624,9 +672,9 @@ void shadow_mode_init(void) { } -int _shadow_mode_enabled(struct domain *d) +int _shadow_mode_refcounts(struct domain *d) { - return shadow_mode_enabled(d); + return shadow_mode_refcounts(d); } static void alloc_monitor_pagetable(struct exec_domain *ed) @@ -706,7 +754,7 @@ void free_monitor_pagetable(struct exec_domain *ed) /* * Then free monitor_table. */ - mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT; + mfn = pagetable_get_pfn(ed->arch.monitor_table); free_domheap_page(&frame_table[mfn]); ed->arch.monitor_table = mk_pagetable(0); @@ -714,7 +762,9 @@ void free_monitor_pagetable(struct exec_domain *ed) } int -set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn) +set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn, + struct map_dom_mem_cache *l2cache, + struct map_dom_mem_cache *l1cache) { unsigned long phystab = pagetable_val(d->arch.phys_table); l2_pgentry_t *l2, l2e; @@ -724,26 +774,29 @@ set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn) ASSERT( phystab ); - l2 = map_domain_mem(phystab); + l2 = map_domain_mem_with_cache(phystab, l2cache); l2e = l2[l2_table_offset(va)]; - if ( !l2e_get_value(l2e) ) /* FIXME: check present bit? */ + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) { l1page = alloc_domheap_page(NULL); if ( !l1page ) + { + unmap_domain_mem_with_cache(l2, l2cache); return 0; + } - l1 = map_domain_mem(page_to_pfn(l1page) << PAGE_SHIFT); + l1 = map_domain_mem_with_cache(page_to_phys(l1page), l1cache); memset(l1, 0, PAGE_SIZE); - unmap_domain_mem(l1); + unmap_domain_mem_with_cache(l1, l1cache); l2e = l2e_create_pfn(page_to_pfn(l1page), __PAGE_HYPERVISOR); l2[l2_table_offset(va)] = l2e; } - unmap_domain_mem(l2); + unmap_domain_mem_with_cache(l2, l2cache); - l1 = map_domain_mem(l2e_get_phys(l2e)); + l1 = map_domain_mem_with_cache(l2e_get_phys(l2e), l1cache); l1[l1_table_offset(va)] = l1e_create_pfn(mfn, __PAGE_HYPERVISOR); - unmap_domain_mem(l1); + unmap_domain_mem_with_cache(l1, l1cache); return 1; } @@ -755,14 +808,16 @@ alloc_p2m_table(struct domain *d) struct pfn_info *page, *l2page; l2_pgentry_t *l2; unsigned long mfn, pfn; + struct map_dom_mem_cache l2cache = MAP_DOM_MEM_CACHE_INIT; + struct map_dom_mem_cache l1cache = MAP_DOM_MEM_CACHE_INIT; l2page = alloc_domheap_page(NULL); if ( !l2page ) return 0; - d->arch.phys_table = mk_pagetable(page_to_pfn(l2page) << PAGE_SHIFT); - l2 = map_domain_mem(page_to_pfn(l2page) << PAGE_SHIFT); + d->arch.phys_table = mk_pagetable(page_to_phys(l2page)); + l2 = map_domain_mem_with_cache(page_to_phys(l2page), &l2cache); memset(l2, 0, PAGE_SIZE); - unmap_domain_mem(l2); + unmap_domain_mem_with_cache(l2, &l2cache); list_ent = d->page_list.next; while ( list_ent != &d->page_list ) @@ -773,7 +828,7 @@ alloc_p2m_table(struct domain *d) ASSERT(pfn != INVALID_M2P_ENTRY); ASSERT(pfn < (1u<<20)); - set_p2m_entry(d, pfn, mfn); + set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache); list_ent = page->list.next; } @@ -787,12 +842,15 @@ alloc_p2m_table(struct domain *d) if ( (pfn != INVALID_M2P_ENTRY) && (pfn < (1u<<20)) ) { - set_p2m_entry(d, pfn, mfn); + set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache); } list_ent = page->list.next; } + unmap_domain_mem_cache(&l2cache); + unmap_domain_mem_cache(&l1cache); + return 1; } @@ -915,13 +973,13 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) { // external guests provide their own memory for their P2M maps. // - ASSERT( d == page_get_owner(&frame_table[pagetable_val( - d->arch.phys_table)>>PAGE_SHIFT]) ); + ASSERT( d == page_get_owner( + &frame_table[pagetable_get_pfn(d->arch.phys_table)]) ); } } printk("audit1\n"); - _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK); + _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK); printk("audit1 done\n"); // Get rid of any shadow pages from any previous shadow mode. @@ -929,15 +987,9 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) free_shadow_pages(d); printk("audit2\n"); - _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK); + _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK); printk("audit2 done\n"); - // Turn off writable page tables. - // It doesn't mix with shadow mode. - // And shadow mode offers a superset of functionality. - // - vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables); - /* * Tear down it's counts by disassembling its page-table-based ref counts. * Also remove CR3's gcount/tcount. @@ -959,23 +1011,27 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) * Assert that no pages are left with L1/L2/L3/L4 type. */ audit_adjust_pgtables(d, -1, 1); + d->arch.shadow_mode = mode; - struct list_head *list_ent = d->page_list.next; - while ( list_ent != &d->page_list ) + if ( shadow_mode_refcounts(d) ) { - struct pfn_info *page = list_entry(list_ent, struct pfn_info, list); - if ( !get_page_type(page, PGT_writable_page) ) - BUG(); - put_page_type(page); + struct list_head *list_ent = d->page_list.next; + while ( list_ent != &d->page_list ) + { + struct pfn_info *page = list_entry(list_ent, struct pfn_info, list); + if ( !get_page_type(page, PGT_writable_page) ) + BUG(); + put_page_type(page); - list_ent = page->list.next; + list_ent = page->list.next; + } } audit_adjust_pgtables(d, 1, 1); printk("audit3\n"); - _audit_domain(d, AUDIT_ALREADY_LOCKED); + _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK); printk("audit3 done\n"); return 0; @@ -1120,8 +1176,8 @@ void __shadow_mode_disable(struct domain *d) * Currently this does not fix up page ref counts, so it is valid to call * only when a domain is being destroyed. */ - BUG_ON(!test_bit(DF_DYING, &d->d_flags)); - d->arch.shadow_tainted_refcnts = 1; + BUG_ON(!test_bit(DF_DYING, &d->d_flags) && shadow_mode_refcounts(d)); + d->arch.shadow_tainted_refcnts = shadow_mode_refcounts(d); free_shadow_pages(d); free_writable_pte_predictions(d); @@ -1138,11 +1194,17 @@ void __shadow_mode_disable(struct domain *d) } } #endif - + d->arch.shadow_mode = 0; free_shadow_ht_entries(d); free_out_of_sync_entries(d); + + struct exec_domain *ed; + for_each_exec_domain(d, ed) + { + update_pagetables(ed); + } } static int shadow_mode_table_op( @@ -1281,6 +1343,7 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) switch ( op ) { case DOM0_SHADOW_CONTROL_OP_OFF: + __shadow_sync_all(d); __shadow_mode_disable(d); break; @@ -1298,7 +1361,7 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE: free_shadow_pages(d); rc = __shadow_mode_enable( - d, d->arch.shadow_mode|SHM_enable|SHM_translate); + d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate); break; default: @@ -1560,23 +1623,23 @@ void shadow_map_l1_into_current_l2(unsigned long va) if ( init_table ) { + l1_pgentry_t sl1e; + int index = l1_table_offset(va); + int min = 1, max = 0; + gpl1e = &(linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)]); spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)]); - l1_pgentry_t sl1e; - int index = l1_table_offset(va); - int min = 1, max = 0; - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { l1pte_propagate_from_guest(d, gpl1e[i], &sl1e); if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(sl1e, d) ) + unlikely(!shadow_get_page_from_l1e(sl1e, d)) ) sl1e = l1e_empty(); - if ( l1e_get_value(sl1e) == 0 ) /* FIXME: check flags? */ + if ( l1e_get_flags(sl1e) == 0 ) { // First copy entries from 0 until first invalid. // Then copy entries from index until first invalid. @@ -1695,7 +1758,8 @@ shadow_make_snapshot( if ( !get_shadow_ref(smfn) ) BUG(); - if ( shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow ) + if ( shadow_mode_refcounts(d) && + (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) ) min_max = pfn_to_page(sl1mfn)->tlbflush_timestamp; pfn_to_page(smfn)->tlbflush_timestamp = min_max; @@ -1748,7 +1812,18 @@ shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn, ASSERT(spin_is_locked(&d->arch.shadow_lock)); ASSERT(pfn_valid(mfn)); - ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page); + +#ifndef NDEBUG + u32 type = page->u.inuse.type_info & PGT_type_mask; + if ( shadow_mode_refcounts(d) ) + { + ASSERT(type == PGT_writable_page); + } + else + { + ASSERT(type && (type < PGT_l4_page_table)); + } +#endif FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__, gpfn, mfn, page->count_info, page->u.inuse.type_info); @@ -1766,6 +1841,10 @@ shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn, entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn); entry->writable_pl1e = -1; +#if SHADOW_DEBUG + mark_shadows_as_reflecting_snapshot(d, gpfn); +#endif + // increment guest's ref count to represent the entry in the // full shadow out-of-sync list. // @@ -1859,7 +1938,7 @@ static int snapshot_entry_matches( int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va) { struct domain *d = ed->domain; - unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; + unsigned long l2mfn = pagetable_get_pfn(ed->arch.guest_table); l2_pgentry_t l2e; unsigned long l1mfn; @@ -1867,6 +1946,10 @@ int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va) perfc_incrc(shadow_out_of_sync_calls); + // PERF BUG: snapshot_entry_matches will call map_domain_mem() on the l2 + // page, but it's already available at ed->arch.guest_vtable... + // Ditto for the sl2 page and ed->arch.shadow_vtable. + // if ( page_out_of_sync(&frame_table[l2mfn]) && !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) ) return 1; @@ -1881,6 +1964,10 @@ int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va) if ( !VALID_MFN(l1mfn) ) return 0; + // PERF BUG: snapshot_entry_matches will call map_domain_mem() on the l1 + // page, but it's already available at linear_pg_table[l1_linear_offset()]. + // Ditto for the sl1 page and shadow_linear_pg_table[]... + // if ( page_out_of_sync(&frame_table[l1mfn]) && !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) ) return 1; @@ -2002,7 +2089,7 @@ static u32 remove_all_write_access_in_ptpage( found++; pt[i] = new; if ( is_l1_shadow ) - put_page_from_l1e(old, d); + shadow_put_page_from_l1e(old, d); #if 0 printk("removed write access to pfn=%lx mfn=%lx in smfn=%lx entry %x " @@ -2060,8 +2147,7 @@ int shadow_remove_all_write_access( // write_refs = (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask); - if ( write_refs && - (frame_table[readonly_gmfn].u.inuse.type_info & PGT_pinned) ) + if ( write_refs && MFN_PINNED(readonly_gmfn) ) { write_refs--; } @@ -2141,7 +2227,7 @@ static u32 remove_all_access_in_page( count++; if ( is_l1_shadow ) - put_page_from_l1e(ol2e, d); + shadow_put_page_from_l1e(ol2e, d); else /* must be an hl2 page */ put_page(&frame_table[forbidden_gmfn]); } @@ -2210,8 +2296,23 @@ static int resync_all(struct domain *d, u32 stype) if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) continue; - if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) ) - continue; + smfn = __shadow_status(d, entry->gpfn, stype); + + if ( !smfn ) + { + if ( shadow_mode_refcounts(d) ) + continue; + + // For light weight shadows, even when no shadow page exists, + // we need to resync the refcounts to the new contents of the + // guest page. + // This only applies when we have writable page tables. + // + if ( (stype == PGT_l1_shadow) && !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) ) + continue; + if ( (stype != PGT_l1_shadow) && !shadow_mode_write_all(d) ) + continue; + } FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx", stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn); @@ -2221,12 +2322,29 @@ static int resync_all(struct domain *d, u32 stype) // guest = map_domain_mem(entry->gmfn << PAGE_SHIFT); snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT); - shadow = map_domain_mem(smfn << PAGE_SHIFT); + + if ( smfn ) + shadow = map_domain_mem(smfn << PAGE_SHIFT); + else + shadow = NULL; + unshadow = 0; switch ( stype ) { case PGT_l1_shadow: { + l1_pgentry_t *guest1 = guest; + l1_pgentry_t *shadow1 = shadow; + l1_pgentry_t *snapshot1 = snapshot; + + ASSERT(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)); + + if ( !shadow_mode_refcounts(d) ) + revalidate_l1(d, guest1, snapshot1); + + if ( !smfn ) + break; + u32 min_max_shadow = pfn_to_page(smfn)->tlbflush_timestamp; int min_shadow = SHADOW_MIN(min_max_shadow); int max_shadow = SHADOW_MAX(min_max_shadow); @@ -2236,10 +2354,6 @@ static int resync_all(struct domain *d, u32 stype) int min_snapshot = SHADOW_MIN(min_max_snapshot); int max_snapshot = SHADOW_MAX(min_max_snapshot); - l1_pgentry_t *guest1 = guest; - l1_pgentry_t *shadow1 = shadow; - l1_pgentry_t *snapshot1 = snapshot; - changed = 0; for ( i = min_shadow; i <= max_shadow; i++ ) @@ -2270,6 +2384,9 @@ static int resync_all(struct domain *d, u32 stype) l2_pgentry_t *shadow2 = shadow; l2_pgentry_t *snapshot2 = snapshot; + ASSERT(shadow_mode_write_all(d)); + BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented + changed = 0; for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) { @@ -2295,8 +2412,7 @@ static int resync_all(struct domain *d, u32 stype) // Need a better solution long term. if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) && unlikely(l2e_get_value(new_pde) != 0) && - !unshadow && - (frame_table[smfn].u.inuse.type_info & PGT_pinned) ) + !unshadow && MFN_PINNED(smfn) ) unshadow = 1; } if ( max == -1 ) @@ -2311,6 +2427,9 @@ static int resync_all(struct domain *d, u32 stype) l2_pgentry_t *snapshot2 = snapshot; l1_pgentry_t *shadow2 = shadow; + ASSERT(shadow_mode_write_all(d)); + BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented + changed = 0; for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) { @@ -2338,7 +2457,8 @@ static int resync_all(struct domain *d, u32 stype) BUG(); } - unmap_domain_mem(shadow); + if ( smfn ) + unmap_domain_mem(shadow); unmap_domain_mem(snapshot); unmap_domain_mem(guest); @@ -2351,7 +2471,7 @@ static int resync_all(struct domain *d, u32 stype) unsigned long hl2mfn; if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) && - (frame_table[hl2mfn].u.inuse.type_info & PGT_pinned) ) + MFN_PINNED(hl2mfn) ) shadow_unpin(hl2mfn); } } @@ -2388,7 +2508,7 @@ void __shadow_sync_all(struct domain *d) !shadow_get_page_from_l1e(npte, d) ) BUG(); *ppte = npte; - put_page_from_l1e(opte, d); + shadow_put_page_from_l1e(opte, d); unmap_domain_mem(ppte); } @@ -2475,13 +2595,23 @@ int shadow_fault(unsigned long va, struct xen_regs *regs) /* Write fault? */ if ( regs->error_code & 2 ) { + int allow_writes = 0; + if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) ) { - /* Write fault on a read-only mapping. */ - SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", - l1e_get_value(gpte)); - perfc_incrc(shadow_fault_bail_ro_mapping); - goto fail; + if ( shadow_mode_page_writable(d, l1e_get_pfn(gpte)) ) + { + allow_writes = 1; + l1e_add_flags(&gpte, _PAGE_RW); + } + else + { + /* Write fault on a read-only mapping. */ + SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", + l1e_get_value(gpte)); + perfc_incrc(shadow_fault_bail_ro_mapping); + goto fail; + } } if ( !l1pte_write_fault(ed, &gpte, &spte, va) ) @@ -2491,6 +2621,9 @@ int shadow_fault(unsigned long va, struct xen_regs *regs) shadow_unlock(d); return 0; } + + if ( allow_writes ) + l1e_remove_flags(&gpte, _PAGE_RW); } else { @@ -2506,21 +2639,22 @@ int shadow_fault(unsigned long va, struct xen_regs *regs) /* * STEP 3. Write the modified shadow PTE and guest PTE back to the tables. */ - - /* XXX Watch out for read-only L2 entries! (not used in Linux). */ - if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)], - &gpte, sizeof(gpte))) ) + if ( l1e_has_changed(&orig_gpte, &gpte, PAGE_FLAG_MASK) ) { - printk("shadow_fault() failed, crashing domain %d " - "due to a read-only L2 page table (gpde=%lx), va=%lx\n", - d->id, l2e_get_value(gpde), va); - domain_crash_synchronous(); - } + /* XXX Watch out for read-only L2 entries! (not used in Linux). */ + if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)], + &gpte, sizeof(gpte))) ) + { + printk("%s() failed, crashing domain %d " + "due to a read-only L2 page table (gpde=%lx), va=%lx\n", + __func__, d->id, l2e_get_value(gpde), va); + domain_crash_synchronous(); + } - // if necessary, record the page table page as dirty - if ( unlikely(shadow_mode_log_dirty(d)) && - l1e_has_changed(&orig_gpte, &gpte, PAGE_FLAG_MASK)) - mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde))); + // if necessary, record the page table page as dirty + if ( unlikely(shadow_mode_log_dirty(d)) ) + __mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde))); + } shadow_set_l1e(va, spte, 1); @@ -2537,6 +2671,109 @@ int shadow_fault(unsigned long va, struct xen_regs *regs) return 0; } +void shadow_l1_normal_pt_update( + struct domain *d, + unsigned long pa, l1_pgentry_t gpte, + struct map_dom_mem_cache *cache) +{ + unsigned long sl1mfn; + l1_pgentry_t *spl1e, spte; + + shadow_lock(d); + + sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow); + if ( sl1mfn ) + { + SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%08lx", + (void *)pa, l1e_get_value(gpte)); + l1pte_propagate_from_guest(current->domain, gpte, &spte); + + spl1e = map_domain_mem_with_cache(sl1mfn << PAGE_SHIFT, cache); + spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte; + unmap_domain_mem_with_cache(spl1e, cache); + } + + shadow_unlock(d); +} + +void shadow_l2_normal_pt_update( + struct domain *d, + unsigned long pa, l2_pgentry_t gpde, + struct map_dom_mem_cache *cache) +{ + unsigned long sl2mfn; + l2_pgentry_t *spl2e; + + shadow_lock(d); + + sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow); + if ( sl2mfn ) + { + SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%08lx", + (void *)pa, l2e_get_value(gpde)); + spl2e = map_domain_mem_with_cache(sl2mfn << PAGE_SHIFT, cache); + validate_pde_change(d, gpde, + &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]); + unmap_domain_mem_with_cache(spl2e, cache); + } + + shadow_unlock(d); +} + +#ifdef __x86_64__ +void shadow_l3_normal_pt_update( + struct domain *d, + unsigned long pa, l3_pgentry_t gpde, + struct map_dom_mem_cache *cache) +{ + BUG(); // not yet implemented +} + +void shadow_l4_normal_pt_update( + struct domain *d, + unsigned long pa, l4_pgentry_t gpde, + struct map_dom_mem_cache *cache) +{ + BUG(); // not yet implemented +} +#endif + +int shadow_do_update_va_mapping(unsigned long va, + l1_pgentry_t val, + struct exec_domain *ed) +{ + struct domain *d = ed->domain; + l1_pgentry_t spte; + int rc = 0; + + shadow_lock(d); + + //printk("%s(va=%p, val=%p)\n", __func__, (void *)va, (void *)l1e_get_value(val)); + + // This is actually overkill - we don't need to sync the L1 itself, + // just everything involved in getting to this L1 (i.e. we need + // linear_pg_table[l1_linear_offset(va)] to be in sync)... + // + __shadow_sync_va(ed, va); + + l1pte_propagate_from_guest(d, val, &spte); + shadow_set_l1e(va, spte, 0); + + /* + * If we're in log-dirty mode then we need to note that we've updated + * the PTE in the PT-holding page. We need the machine frame number + * for this. + */ + if ( shadow_mode_log_dirty(d) ) + __mark_dirty(d, va_to_l1mfn(ed, va)); + +// out: + shadow_unlock(d); + + return rc; +} + + /* * What lives where in the 32-bit address space in the various shadow modes, * and what it uses to get/maintain that mapping. @@ -2566,7 +2803,7 @@ int shadow_fault(unsigned long va, struct xen_regs *regs) void __update_pagetables(struct exec_domain *ed) { struct domain *d = ed->domain; - unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; + unsigned long gmfn = pagetable_get_pfn(ed->arch.guest_table); unsigned long gpfn = __mfn_to_gpfn(d, gmfn); unsigned long smfn, hl2mfn, old_smfn; @@ -2595,7 +2832,7 @@ void __update_pagetables(struct exec_domain *ed) smfn = shadow_l2_table(d, gpfn, gmfn); if ( !get_shadow_ref(smfn) ) BUG(); - old_smfn = pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT; + old_smfn = pagetable_get_pfn(ed->arch.shadow_table); ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT); if ( old_smfn ) put_shadow_ref(old_smfn); @@ -2665,6 +2902,47 @@ void __update_pagetables(struct exec_domain *ed) #if SHADOW_DEBUG +// The following is entirely for _check_pagetable()'s benefit. +// _check_pagetable() wants to know whether a given entry in a +// shadow page table is supposed to be the shadow of the guest's +// current entry, or the shadow of the entry held in the snapshot +// taken above. +// +// Here, we mark all currently existing entries as reflecting +// the snapshot, above. All other places in xen that update +// the shadow will keep the shadow in sync with the guest's +// entries (via l1pte_propagate_from_guest and friends), which clear +// the SHADOW_REFLECTS_SNAPSHOT bit. +// +static void +mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn) +{ + unsigned long smfn; + l1_pgentry_t *l1e; + l2_pgentry_t *l2e; + unsigned i; + + if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) ) + { + l1e = map_domain_mem(smfn << PAGE_SHIFT); + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + if ( is_guest_l1_slot(i) && + (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) ) + l1e_add_flags(&l1e[i], SHADOW_REFLECTS_SNAPSHOT); + unmap_domain_mem(l1e); + } + + if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) ) + { + l2e = map_domain_mem(smfn << PAGE_SHIFT); + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + if ( is_guest_l2_slot(i) && + (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) ) + l2e_add_flags(&l2e[i], SHADOW_REFLECTS_SNAPSHOT); + unmap_domain_mem(l2e); + } +} + // BUG: these are not SMP safe... static int sh_l2_present; static int sh_l1_present; @@ -2687,96 +2965,109 @@ int shadow_status_noswap; #define FAIL(_f, _a...) \ do { \ - printk("XXX %s-FAIL (%d,%d,%d)" _f " at %s(%d)\n", \ + printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \ sh_check_name, level, l2_idx, l1_idx, ## _a, \ __FILE__, __LINE__); \ - printk("g=%lx s=%lx &g=%p &s=%p" \ - " v2m(&g)=%08lx v2m(&s)=%08lx ea=%08x\n", \ - l1e_get_value(gpte), l1e_get_value(spte), pgpte, pspte, \ - v2m(ed, pgpte), v2m(ed, pspte), \ + printk("guest_pte=%lx eff_guest_pte=%lx shadow_pte=%lx " \ + "snapshot_pte=%lx &guest=%p &shadow=%p &snap=%p " \ + "v2m(&guest)=%p v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \ + l1e_get_value(guest_pte), l1e_get_value(eff_guest_pte), \ + l1e_get_value(shadow_pte), l1e_get_value(snapshot_pte), \ + p_guest_pte, p_shadow_pte, p_snapshot_pte, \ + (void *)v2m(ed, p_guest_pte), (void *)v2m(ed, p_shadow_pte), \ + (void *)v2m(ed, p_snapshot_pte), \ (l2_idx << L2_PAGETABLE_SHIFT) | \ (l1_idx << L1_PAGETABLE_SHIFT)); \ errors++; \ } while ( 0 ) static int check_pte( - struct exec_domain *ed, l1_pgentry_t *pgpte, l1_pgentry_t *pspte, - int level, int l2_idx, int l1_idx, int oos_ptes) + struct exec_domain *ed, + l1_pgentry_t *p_guest_pte, + l1_pgentry_t *p_shadow_pte, + l1_pgentry_t *p_snapshot_pte, + int level, int l2_idx, int l1_idx) { struct domain *d = ed->domain; - l1_pgentry_t gpte = *pgpte; - l1_pgentry_t spte = *pspte; - unsigned long mask, gpfn, smfn, gmfn; - int errors = 0; + l1_pgentry_t guest_pte = *p_guest_pte; + l1_pgentry_t shadow_pte = *p_shadow_pte; + l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty(); + l1_pgentry_t eff_guest_pte; + unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn; + int errors = 0, guest_writable; int page_table_page; - if ( (l1e_get_value(spte) == 0) || - (l1e_get_value(spte) == 0xdeadface) || - (l1e_get_value(spte) == 0x00000E00) ) + if ( (l1e_get_value(shadow_pte) == 0) || + (l1e_get_value(shadow_pte) == 0xdeadface) || + (l1e_get_value(shadow_pte) == 0x00000E00) ) return errors; /* always safe */ - if ( !(l1e_get_flags(spte) & _PAGE_PRESENT) ) - FAIL("Non zero not present spte"); + if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) ) + FAIL("Non zero not present shadow_pte"); if ( level == 2 ) sh_l2_present++; if ( level == 1 ) sh_l1_present++; - if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) ) + if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte ) + eff_guest_pte = snapshot_pte; + else + eff_guest_pte = guest_pte; + + if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) ) FAIL("Guest not present yet shadow is"); - mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|PAGE_MASK); + mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK); - if ( (l1e_get_value(spte) & mask) != (l1e_get_value(gpte) & mask) ) + if ( ((l1e_get_value(shadow_pte) & mask) != (l1e_get_value(eff_guest_pte) & mask)) ) FAIL("Corrupt?"); if ( (level == 1) && - (l1e_get_flags(spte) & _PAGE_DIRTY) && - !(l1e_get_flags(gpte) & _PAGE_DIRTY) && !oos_ptes ) + (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) && + !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) ) FAIL("Dirty coherence"); - if ( (l1e_get_flags(spte) & _PAGE_ACCESSED) && - !(l1e_get_flags(gpte) & _PAGE_ACCESSED) && !oos_ptes ) + if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) && + !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) ) FAIL("Accessed coherence"); - if ( l1e_get_flags(spte) & _PAGE_GLOBAL ) + if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL ) FAIL("global bit set in shadow"); - smfn = l1e_get_pfn(spte); - gpfn = l1e_get_pfn(gpte); - gmfn = __gpfn_to_mfn(d, gpfn); + eff_guest_pfn = l1e_get_pfn(eff_guest_pte); + eff_guest_mfn = __gpfn_to_mfn(d, eff_guest_pfn); + shadow_mfn = l1e_get_pfn(shadow_pte); + + if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) ) + FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%lx\n", __func__, eff_guest_pfn, + l1e_get_value(eff_guest_pte)); - if ( !VALID_MFN(gmfn) ) - FAIL("%s: invalid gpfn=%lx gpte=%lx\n", __func__, gpfn, - l1e_get_value(gpte)); + page_table_page = mfn_is_page_table(eff_guest_mfn); - page_table_page = mfn_is_page_table(gmfn); + guest_writable = + (l1e_get_flags(eff_guest_pte) & _PAGE_RW) || + (VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && (level == 1) && mfn_out_of_sync(eff_guest_mfn)); - if ( (l1e_get_flags(spte) & _PAGE_RW ) && - !(l1e_get_flags(gpte) & _PAGE_RW) && !oos_ptes ) + if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable ) { - printk("gpfn=%lx gmfn=%lx smfn=%lx t=0x%08x page_table_page=%d " - "oos_ptes=%d\n", - gpfn, gmfn, smfn, - frame_table[gmfn].u.inuse.type_info, - page_table_page, oos_ptes); + printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n", + eff_guest_pfn, eff_guest_mfn, shadow_mfn, + frame_table[eff_guest_mfn].u.inuse.type_info, + page_table_page); FAIL("RW coherence"); } if ( (level == 1) && - (l1e_get_flags(spte) & _PAGE_RW ) && - !((l1e_get_flags(gpte) & _PAGE_RW) && - (l1e_get_flags(gpte) & _PAGE_DIRTY)) && - !oos_ptes ) - { - printk("gpfn=%lx gmfn=%lx smfn=%lx t=0x%08x page_table_page=%d " - "oos_ptes=%d\n", - gpfn, gmfn, smfn, - frame_table[gmfn].u.inuse.type_info, - page_table_page, oos_ptes); + (l1e_get_flags(shadow_pte) & _PAGE_RW ) && + !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) ) + { + printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n", + eff_guest_pfn, eff_guest_mfn, shadow_mfn, + frame_table[eff_guest_mfn].u.inuse.type_info, + page_table_page); FAIL("RW2 coherence"); } - if ( gmfn == smfn ) + if ( eff_guest_mfn == shadow_mfn ) { if ( level > 1 ) FAIL("Linear map ???"); /* XXX this will fail on BSD */ @@ -2788,9 +3079,9 @@ static int check_pte( if ( level == 2 ) { - if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn ) - FAIL("smfn problem gpfn=%lx smfn=%lx", gpfn, - __shadow_status(d, gpfn, PGT_l1_shadow)); + if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn ) + FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn, + __shadow_status(d, eff_guest_pfn, PGT_l1_shadow)); } else BUG(); // XXX -- not handled yet. @@ -2807,24 +3098,29 @@ static int check_l1_table( { struct domain *d = ed->domain; int i; - l1_pgentry_t *gpl1e, *spl1e; - int errors = 0, oos_ptes = 0; + unsigned long snapshot_mfn; + l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL; + int errors = 0; if ( page_out_of_sync(pfn_to_page(gmfn)) ) { - gmfn = __shadow_status(d, gpfn, PGT_snapshot); - oos_ptes = 1; - ASSERT(gmfn); + snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot); + ASSERT(snapshot_mfn); + p_snapshot = map_domain_mem(snapshot_mfn << PAGE_SHIFT); } - gpl1e = map_domain_mem(gmfn << PAGE_SHIFT); - spl1e = map_domain_mem(smfn << PAGE_SHIFT); + p_guest = map_domain_mem(gmfn << PAGE_SHIFT); + p_shadow = map_domain_mem(smfn << PAGE_SHIFT); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - errors += check_pte(ed, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes); + errors += check_pte(ed, p_guest+i, p_shadow+i, + p_snapshot ? p_snapshot+i : NULL, + 1, l2_idx, i); - unmap_domain_mem(spl1e); - unmap_domain_mem(gpl1e); + unmap_domain_mem(p_shadow); + unmap_domain_mem(p_guest); + if ( p_snapshot ) + unmap_domain_mem(p_snapshot); return errors; } @@ -2909,7 +3205,8 @@ int check_l2_table( errors += check_pte(ed, (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */ (l1_pgentry_t*)(&spl2e[i]), - 2, i, 0, 0); + NULL, + 2, i, 0); unmap_domain_mem(spl2e); unmap_domain_mem(gpl2e); diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 948dd1802b..4f87b7f96f 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -270,7 +270,8 @@ asmlinkage int do_page_fault(struct xen_regs *regs) perfc_incrc(page_faults); - if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) ) + if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && + !shadow_mode_enabled(d)) ) { LOCK_BIGLOCK(d); if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) && @@ -286,8 +287,6 @@ asmlinkage int do_page_fault(struct xen_regs *regs) ((regs->error_code & 3) == 3) && /* write-protection fault */ ptwr_do_page_fault(d, addr) ) { - if ( unlikely(shadow_mode_enabled(d)) ) - (void)shadow_fault(addr, regs); UNLOCK_BIGLOCK(d); return EXCRET_fault_fixed; } diff --git a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c index db82c73eac..b677b9dc24 100644 --- a/xen/arch/x86/vmx.c +++ b/xen/arch/x86/vmx.c @@ -672,7 +672,7 @@ static int vmx_set_cr0(unsigned long value) d->arch.arch_vmx.cpu_cr3); domain_crash_synchronous(); /* need to take a clean path */ } - old_base_mfn = pagetable_val(d->arch.guest_table) >> PAGE_SHIFT; + old_base_mfn = pagetable_get_pfn(d->arch.guest_table); if (old_base_mfn) put_page(pfn_to_page(old_base_mfn)); @@ -798,7 +798,7 @@ static int mov_to_cr(int gp, int cr, struct xen_regs *regs) "Invalid CR3 value=%lx", value); domain_crash_synchronous(); /* need to take a clean path */ } - old_base_mfn = pagetable_val(d->arch.guest_table) >> PAGE_SHIFT; + old_base_mfn = pagetable_get_pfn(d->arch.guest_table); d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT); if (old_base_mfn) put_page(pfn_to_page(old_base_mfn)); diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 014fa7e45d..cf4e88bc6d 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -150,7 +150,7 @@ extern void invalidate_shadow_ldt(struct exec_domain *d); extern int shadow_remove_all_write_access( struct domain *d, unsigned long gpfn, unsigned long gmfn); extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn); -extern int _shadow_mode_enabled(struct domain *d); +extern int _shadow_mode_refcounts(struct domain *d); static inline void put_page(struct pfn_info *page) { @@ -182,7 +182,7 @@ static inline int get_page(struct pfn_info *page, unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ unlikely(d != _domain) ) /* Wrong owner? */ { - if ( !_shadow_mode_enabled(domain) ) + if ( !_shadow_mode_refcounts(domain) ) DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%08x\n", page_to_pfn(page), domain, unpickle_domptr(d), x, page->u.inuse.type_info); @@ -315,14 +315,21 @@ int ptwr_init(struct domain *); void ptwr_destroy(struct domain *); void ptwr_flush(struct domain *, const int); int ptwr_do_page_fault(struct domain *, unsigned long); +int revalidate_l1(struct domain *, l1_pgentry_t *, l1_pgentry_t *); #define cleanup_writable_pagetable(_d) \ do { \ - if ( unlikely(VM_ASSIST((_d), VMASST_TYPE_writable_pagetables)) ) { \ - if ( (_d)->arch.ptwr[PTWR_PT_ACTIVE].l1va ) \ - ptwr_flush((_d), PTWR_PT_ACTIVE); \ - if ( (_d)->arch.ptwr[PTWR_PT_INACTIVE].l1va ) \ - ptwr_flush((_d), PTWR_PT_INACTIVE); \ + if ( likely(VM_ASSIST((_d), VMASST_TYPE_writable_pagetables)) ) \ + { \ + if ( likely(!shadow_mode_enabled(_d)) ) \ + { \ + if ( (_d)->arch.ptwr[PTWR_PT_ACTIVE].l1va ) \ + ptwr_flush((_d), PTWR_PT_ACTIVE); \ + if ( (_d)->arch.ptwr[PTWR_PT_INACTIVE].l1va ) \ + ptwr_flush((_d), PTWR_PT_INACTIVE); \ + } \ + else \ + shadow_sync_all(_d); \ } \ } while ( 0 ) @@ -330,9 +337,9 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy); #ifndef NDEBUG -#define AUDIT_ALREADY_LOCKED ( 1u << 0 ) -#define AUDIT_ERRORS_OK ( 1u << 1 ) -#define AUDIT_QUIET ( 1u << 2 ) +#define AUDIT_SHADOW_ALREADY_LOCKED ( 1u << 0 ) +#define AUDIT_ERRORS_OK ( 1u << 1 ) +#define AUDIT_QUIET ( 1u << 2 ) void _audit_domain(struct domain *d, int flags); #define audit_domain(_d) _audit_domain((_d), AUDIT_ERRORS_OK) diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h index a925f90fc8..59e582e4cd 100644 --- a/xen/include/asm-x86/page.h +++ b/xen/include/asm-x86/page.h @@ -23,6 +23,7 @@ #ifndef __ASSEMBLY__ typedef struct { unsigned long pt_lo; } pagetable_t; #define pagetable_val(_x) ((_x).pt_lo) +#define pagetable_get_pfn(_x) ((_x).pt_lo >> PAGE_SHIFT) #define mk_pagetable(_x) ( (pagetable_t) { (_x) } ) #endif @@ -103,6 +104,7 @@ extern void paging_init(void); #define _PAGE_PAT 0x080UL #define _PAGE_PSE 0x080UL #define _PAGE_GLOBAL 0x100UL +#define _PAGE_AVAIL 0xe00UL #define __PAGE_HYPERVISOR \ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h index 88ea9e6dac..146b75ed9d 100644 --- a/xen/include/asm-x86/shadow.h +++ b/xen/include/asm-x86/shadow.h @@ -33,11 +33,17 @@ /* Shadow PT operation mode : shadow-mode variable in arch_domain. */ #define SHM_enable (1<<0) /* we're in one of the shadow modes */ -#define SHM_log_dirty (1<<1) /* enable log dirty mode */ -#define SHM_translate (1<<2) /* do p2m tranaltion on guest tables */ -#define SHM_external (1<<3) /* external page table, not used by Xen */ +#define SHM_refcounts (1<<1) /* refcounts based on shadow tables instead of + guest tables */ +#define SHM_write_all (1<<2) /* allow write access to all guest pt pages, + regardless of pte write permissions */ +#define SHM_log_dirty (1<<3) /* enable log dirty mode */ +#define SHM_translate (1<<4) /* do p2m tranaltion on guest tables */ +#define SHM_external (1<<5) /* external page table, not used by Xen */ #define shadow_mode_enabled(_d) ((_d)->arch.shadow_mode) +#define shadow_mode_refcounts(_d) ((_d)->arch.shadow_mode & SHM_refcounts) +#define shadow_mode_write_all(_d) ((_d)->arch.shadow_mode & SHM_write_all) #define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty) #define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate) #define shadow_mode_external(_d) ((_d)->arch.shadow_mode & SHM_external) @@ -72,7 +78,29 @@ extern void free_monitor_pagetable(struct exec_domain *ed); extern void __shadow_sync_all(struct domain *d); extern int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va); extern int set_p2m_entry( - struct domain *d, unsigned long pfn, unsigned long mfn); + struct domain *d, unsigned long pfn, unsigned long mfn, + struct map_dom_mem_cache *l2cache, + struct map_dom_mem_cache *l1cache); +extern void remove_shadow(struct domain *d, unsigned long gpfn, u32 stype); + +extern void shadow_l1_normal_pt_update(struct domain *d, + unsigned long pa, l1_pgentry_t l1e, + struct map_dom_mem_cache *cache); +extern void shadow_l2_normal_pt_update(struct domain *d, + unsigned long pa, l2_pgentry_t l2e, + struct map_dom_mem_cache *cache); +#ifdef __x86_64__ +extern void shadow_l3_normal_pt_update(struct domain *d, + unsigned long pa, l3_pgentry_t l3e, + struct map_dom_mem_cache *cache); +extern void shadow_l4_normal_pt_update(struct domain *d, + unsigned long pa, l4_pgentry_t l4e, + struct map_dom_mem_cache *cache); +#endif +extern int shadow_do_update_va_mapping(unsigned long va, + l1_pgentry_t val, + struct exec_domain *ed); + static inline unsigned long __shadow_status( struct domain *d, unsigned long gpfn, unsigned long stype); @@ -82,7 +110,13 @@ extern void vmx_shadow_clear_state(struct domain *); static inline int page_is_page_table(struct pfn_info *page) { - return page->count_info & PGC_page_table; + struct domain *owner = page_get_owner(page); + + if ( owner && shadow_mode_refcounts(owner) ) + return page->count_info & PGC_page_table; + + u32 type_info = page->u.inuse.type_info & PGT_type_mask; + return type_info && (type_info <= PGT_l4_page_table); } static inline int mfn_is_page_table(unsigned long mfn) @@ -90,7 +124,7 @@ static inline int mfn_is_page_table(unsigned long mfn) if ( !pfn_valid(mfn) ) return 0; - return frame_table[mfn].count_info & PGC_page_table; + return page_is_page_table(pfn_to_page(mfn)); } static inline int page_out_of_sync(struct pfn_info *page) @@ -103,7 +137,7 @@ static inline int mfn_out_of_sync(unsigned long mfn) if ( !pfn_valid(mfn) ) return 0; - return frame_table[mfn].count_info & PGC_out_of_sync; + return page_out_of_sync(pfn_to_page(mfn)); } @@ -191,10 +225,12 @@ static inline void shadow_mode_disable(struct domain *d) : (mfn) ) #define __gpfn_to_mfn(_d, gpfn) \ - ( (shadow_mode_translate(_d)) \ - ? ({ ASSERT(current->domain == (_d)); \ - phys_to_machine_mapping(gpfn); }) \ - : (gpfn) ) + ({ \ + ASSERT(current->domain == (_d)); \ + (shadow_mode_translate(_d)) \ + ? phys_to_machine_mapping(gpfn) \ + : (gpfn); \ + }) #define __gpfn_to_mfn_foreign(_d, gpfn) \ ( (shadow_mode_translate(_d)) \ @@ -237,6 +273,8 @@ struct out_of_sync_entry { #if SHADOW_DEBUG extern int shadow_status_noswap; +#define _SHADOW_REFLECTS_SNAPSHOT ( 9) +#define SHADOW_REFLECTS_SNAPSHOT (1u << _SHADOW_REFLECTS_SNAPSHOT) #endif #ifdef VERBOSE @@ -292,15 +330,18 @@ shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d) ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT); + if ( !shadow_mode_refcounts(d) ) + return 1; + nl1e = l1e; l1e_remove_flags(&nl1e, _PAGE_GLOBAL); res = get_page_from_l1e(nl1e, d); if ( unlikely(!res) && IS_PRIV(d) && !shadow_mode_translate(d) && - !(l1e_get_flags(l1e) & L1_DISALLOW_MASK) && - (mfn = l1e_get_pfn(l1e)) && + !(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) && + (mfn = l1e_get_pfn(nl1e)) && pfn_valid(mfn) && - (owner = page_get_owner(pfn_to_page(l1e_get_pfn(l1e)))) && + (owner = page_get_owner(pfn_to_page(mfn))) && (d != owner) ) { res = get_page_from_l1e(nl1e, owner); @@ -319,6 +360,103 @@ shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d) return res; } +static inline void +shadow_put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) +{ + if ( !shadow_mode_refcounts(d) ) + return; + + put_page_from_l1e(l1e, d); +} + +static inline void +shadow_put_page_type(struct domain *d, struct pfn_info *page) +{ + if ( !shadow_mode_refcounts(d) ) + return; + + put_page_type(page); +} + +static inline int shadow_get_page(struct domain *d, + struct pfn_info *page, + struct domain *owner) +{ + if ( !shadow_mode_refcounts(d) ) + return 1; + return get_page(page, owner); +} + +static inline void shadow_put_page(struct domain *d, + struct pfn_info *page) +{ + if ( !shadow_mode_refcounts(d) ) + return; + put_page(page); +} + +/************************************************************************/ + +static inline int __mark_dirty(struct domain *d, unsigned int mfn) +{ + unsigned long pfn; + int rc = 0; + + ASSERT(spin_is_locked(&d->arch.shadow_lock)); + ASSERT(d->arch.shadow_dirty_bitmap != NULL); + + if ( !VALID_MFN(mfn) ) + return rc; + + // N.B. This doesn't use __mfn_to_gpfn(). + // This wants the nice compact set of PFNs from 0..domain's max, + // which __mfn_to_gpfn() only returns for translated domains. + // + pfn = machine_to_phys_mapping[mfn]; + + /* + * Values with the MSB set denote MFNs that aren't really part of the + * domain's pseudo-physical memory map (e.g., the shared info frame). + * Nothing to do here... + */ + if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) ) + return rc; + + if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) ) + { + /* N.B. Can use non-atomic TAS because protected by shadow_lock. */ + if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) ) + { + d->arch.shadow_dirty_count++; + rc = 1; + } + } +#ifndef NDEBUG + else if ( mfn < max_page ) + { + SH_LOG("mark_dirty OOR! mfn=%x pfn=%lx max=%x (dom %p)", + mfn, pfn, d->arch.shadow_dirty_bitmap_size, d); + SH_LOG("dom=%p caf=%08x taf=%08x", + page_get_owner(&frame_table[mfn]), + frame_table[mfn].count_info, + frame_table[mfn].u.inuse.type_info ); + } +#endif + + return rc; +} + + +static inline int mark_dirty(struct domain *d, unsigned int mfn) +{ + int rc; + shadow_lock(d); + rc = __mark_dirty(d, mfn); + shadow_unlock(d); + return rc; +} + + /************************************************************************/ static inline void @@ -350,10 +488,15 @@ static inline void __guest_set_l2e( struct exec_domain *ed, unsigned long va, l2_pgentry_t value) { + struct domain *d = ed->domain; + ed->arch.guest_vtable[l2_table_offset(va)] = value; - if ( unlikely(shadow_mode_translate(ed->domain)) ) + if ( unlikely(shadow_mode_translate(d)) ) update_hl2e(ed, va); + + if ( unlikely(shadow_mode_log_dirty(d)) ) + __mark_dirty(d, pagetable_get_pfn(ed->arch.guest_table)); } static inline void @@ -380,11 +523,12 @@ update_hl2e(struct exec_domain *ed, unsigned long va) if ( (l1e_has_changed(&old_hl2e, &new_hl2e, _PAGE_PRESENT)) ) { if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) && - !get_page(pfn_to_page(l1e_get_pfn(new_hl2e)), ed->domain) ) + !shadow_get_page(ed->domain, pfn_to_page(l1e_get_pfn(new_hl2e)), + ed->domain) ) new_hl2e = l1e_empty(); if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT ) { - put_page(pfn_to_page(l1e_get_pfn(old_hl2e))); + shadow_put_page(ed->domain, pfn_to_page(l1e_get_pfn(old_hl2e))); need_flush = 1; } } @@ -401,7 +545,7 @@ update_hl2e(struct exec_domain *ed, unsigned long va) static inline void shadow_drop_references( struct domain *d, struct pfn_info *page) { - if ( likely(!shadow_mode_enabled(d)) || + if ( likely(!shadow_mode_refcounts(d)) || ((page->u.inuse.type_info & PGT_count_mask) == 0) ) return; @@ -423,7 +567,7 @@ static inline void shadow_drop_references( static inline void shadow_sync_and_drop_references( struct domain *d, struct pfn_info *page) { - if ( likely(!shadow_mode_enabled(d)) ) + if ( likely(!shadow_mode_refcounts(d)) ) return; shadow_lock(d); @@ -520,64 +664,6 @@ shadow_unpin(unsigned long smfn) } -/************************************************************************/ - -static inline int __mark_dirty(struct domain *d, unsigned int mfn) -{ - unsigned long pfn; - int rc = 0; - - ASSERT(spin_is_locked(&d->arch.shadow_lock)); - ASSERT(d->arch.shadow_dirty_bitmap != NULL); - - if ( !VALID_MFN(mfn) ) - return rc; - - pfn = __mfn_to_gpfn(d, mfn); - - /* - * Values with the MSB set denote MFNs that aren't really part of the - * domain's pseudo-physical memory map (e.g., the shared info frame). - * Nothing to do here... - */ - if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) ) - return rc; - - if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) ) - { - /* N.B. Can use non-atomic TAS because protected by shadow_lock. */ - if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) ) - { - d->arch.shadow_dirty_count++; - rc = 1; - } - } -#ifndef NDEBUG - else if ( mfn < max_page ) - { - SH_LOG("mark_dirty OOR! mfn=%x pfn=%lx max=%x (dom %p)", - mfn, pfn, d->arch.shadow_dirty_bitmap_size, d); - SH_LOG("dom=%p caf=%08x taf=%08x\n", - page_get_owner(&frame_table[mfn]), - frame_table[mfn].count_info, - frame_table[mfn].u.inuse.type_info ); - } -#endif - - return rc; -} - - -static inline int mark_dirty(struct domain *d, unsigned int mfn) -{ - int rc; - shadow_lock(d); - rc = __mark_dirty(d, mfn); - shadow_unlock(d); - return rc; -} - - /************************************************************************/ extern void shadow_mark_va_out_of_sync( @@ -666,8 +752,10 @@ static inline void l1pte_propagate_from_guest( (_PAGE_PRESENT|_PAGE_ACCESSED)) && VALID_MFN(mfn = __gpfn_to_mfn(d, l1e_get_pfn(gpte))) ) { - spte = l1e_create_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL); - + spte = l1e_create_pfn(mfn, + l1e_get_flags(gpte) & + ~(_PAGE_GLOBAL | _PAGE_AVAIL)); + if ( shadow_mode_log_dirty(d) || !(l1e_get_flags(gpte) & _PAGE_DIRTY) || mfn_is_page_table(mfn) ) @@ -729,14 +817,13 @@ static inline void l2pde_general( spde = l2e_empty(); if ( (l2e_get_flags(gpde) & _PAGE_PRESENT) && (sl1mfn != 0) ) { - spde = l2e_create_pfn(sl1mfn, - l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED); - l2e_add_flags(&gpde, _PAGE_ACCESSED); /* N.B. PDEs do not have a dirty bit. */ + spde = l2e_create_pfn(sl1mfn, + (l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED) + & ~(_PAGE_AVAIL)); + + /* N.B. PDEs do not have a dirty bit. */ + l2e_add_flags(&gpde, _PAGE_ACCESSED); - // XXX mafetter: Hmm... - // Shouldn't the dirty log be checked/updated here? - // Actually, it needs to be done in this function's callers. - // *gpde_p = gpde; } @@ -769,34 +856,57 @@ validate_pte_change( l1_pgentry_t *shadow_pte_p) { l1_pgentry_t old_spte, new_spte; + int need_flush = 0; perfc_incrc(validate_pte_calls); -#if 0 - FSH_LOG("validate_pte(old=%lx new=%lx)", old_pte, new_pte); -#endif - - old_spte = *shadow_pte_p; l1pte_propagate_from_guest(d, new_pte, &new_spte); - // only do the ref counting if something important changed. - // - if ( ((l1e_get_value(old_spte) | l1e_get_value(new_spte)) & _PAGE_PRESENT ) && - l1e_has_changed(&old_spte, &new_spte, _PAGE_RW | _PAGE_PRESENT) ) + if ( shadow_mode_refcounts(d) ) { - perfc_incrc(validate_pte_changes); + old_spte = *shadow_pte_p; - if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(new_spte, d) ) - new_spte = l1e_empty(); - if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) - put_page_from_l1e(old_spte, d); + if ( l1e_get_value(old_spte) == l1e_get_value(new_spte) ) + { + // No accounting required... + // + perfc_incrc(validate_pte_changes1); + } + else if ( l1e_get_value(old_spte) == (l1e_get_value(new_spte)|_PAGE_RW) ) + { + // Fast path for PTEs that have merely been write-protected + // (e.g., during a Unix fork()). A strict reduction in privilege. + // + perfc_incrc(validate_pte_changes2); + if ( likely(l1e_get_flags(new_spte) & _PAGE_PRESENT) ) + shadow_put_page_type(d, &frame_table[l1e_get_pfn(new_spte)]); + } + else if ( ((l1e_get_flags(old_spte) | l1e_get_flags(new_spte)) & + _PAGE_PRESENT ) && + l1e_has_changed(&old_spte, &new_spte, _PAGE_RW | _PAGE_PRESENT) ) + { + // only do the ref counting if something important changed. + // + perfc_incrc(validate_pte_changes3); + + if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) && + !shadow_get_page_from_l1e(new_spte, d) ) + new_spte = l1e_empty(); + if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) + { + shadow_put_page_from_l1e(old_spte, d); + need_flush = 1; + } + } + else + { + perfc_incrc(validate_pte_changes4); + } } *shadow_pte_p = new_spte; - // paranoia rules! - return 1; + return need_flush; } // returns true if a tlb flush is needed @@ -808,6 +918,7 @@ validate_hl2e_change( l1_pgentry_t *shadow_hl2e_p) { l1_pgentry_t old_hl2e, new_hl2e; + int need_flush = 0; perfc_incrc(validate_hl2e_calls); @@ -825,14 +936,15 @@ validate_hl2e_change( !get_page(pfn_to_page(l1e_get_pfn(new_hl2e)), d) ) new_hl2e = l1e_empty(); if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT ) + { put_page(pfn_to_page(l1e_get_pfn(old_hl2e))); + need_flush = 1; + } } *shadow_hl2e_p = new_hl2e; - // paranoia rules! - return 1; - + return need_flush; } // returns true if a tlb flush is needed @@ -844,15 +956,13 @@ validate_pde_change( l2_pgentry_t *shadow_pde_p) { l2_pgentry_t old_spde, new_spde; + int need_flush = 0; perfc_incrc(validate_pde_calls); old_spde = *shadow_pde_p; l2pde_propagate_from_guest(d, &new_gpde, &new_spde); - // XXX Shouldn't we propagate the new_gpde to the guest? - // And then mark the guest's L2 page as dirty? - // Only do the ref counting if something important changed. // if ( ((l2e_get_value(old_spde) | l2e_get_value(new_spde)) & _PAGE_PRESENT) && @@ -864,13 +974,15 @@ validate_pde_change( !get_shadow_ref(l2e_get_pfn(new_spde)) ) BUG(); if ( l2e_get_flags(old_spde) & _PAGE_PRESENT ) + { put_shadow_ref(l2e_get_pfn(old_spde)); + need_flush = 1; + } } *shadow_pde_p = new_spde; - // paranoia rules! - return 1; + return need_flush; } /*********************************************************************/ @@ -1035,10 +1147,19 @@ static inline unsigned long __shadow_status( { perfc_incrc(shadow_status_shortcut); #ifndef NDEBUG - ASSERT(___shadow_status(d, gpfn, stype) == 0); + if ( ___shadow_status(d, gpfn, stype) != 0 ) + { + printk("d->id=%d gpfn=%lx gmfn=%lx stype=%lx c=%x t=%x " + "mfn_out_of_sync(gmfn)=%d mfn_is_page_table(gmfn)=%d\n", + d->id, gpfn, gmfn, stype, + frame_table[gmfn].count_info, + frame_table[gmfn].u.inuse.type_info, + mfn_out_of_sync(gmfn), mfn_is_page_table(gmfn)); + BUG(); + } - // Undo the affects of the above ASSERT on ___shadow_status()'s perf - // counters. + // Undo the affects of the above call to ___shadow_status()'s perf + // counters, since that call is really just part of an assertion. // perfc_decrc(shadow_status_calls); perfc_decrc(shadow_status_miss); @@ -1056,12 +1177,12 @@ static inline unsigned long __shadow_status( * * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table. */ -static inline unsigned long +static inline u32 shadow_max_pgtable_type(struct domain *d, unsigned long gpfn, unsigned long *smfn) { struct shadow_status *x; - unsigned long pttype = PGT_none, type; + u32 pttype = PGT_none, type; ASSERT(spin_is_locked(&d->arch.shadow_lock)); ASSERT(gpfn == (gpfn & PGT_mfn_mask)); @@ -1379,7 +1500,6 @@ shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow) struct exec_domain *ed = current; struct domain *d = ed->domain; l2_pgentry_t sl2e; - l1_pgentry_t old_spte; #if 0 printk("shadow_set_l1e(va=%p, new_spte=%p, create=%d)\n", @@ -1424,17 +1544,20 @@ shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow) } } - old_spte = shadow_linear_pg_table[l1_linear_offset(va)]; - - // only do the ref counting if something important changed. - // - if ( l1e_has_changed(&old_spte, &new_spte, _PAGE_RW | _PAGE_PRESENT) ) + if ( shadow_mode_refcounts(d) ) { - if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(new_spte, d) ) - new_spte = l1e_empty(); - if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) - put_page_from_l1e(old_spte, d); + l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)]; + + // only do the ref counting if something important changed. + // + if ( l1e_has_changed(&old_spte, &new_spte, _PAGE_RW | _PAGE_PRESENT) ) + { + if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) && + !shadow_get_page_from_l1e(new_spte, d) ) + new_spte = l1e_empty(); + if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) + shadow_put_page_from_l1e(old_spte, d); + } } shadow_linear_pg_table[l1_linear_offset(va)] = new_spte; @@ -1444,6 +1567,27 @@ shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow) /************************************************************************/ +static inline int +shadow_mode_page_writable(struct domain *d, unsigned long gpfn) +{ + unsigned long mfn = __gpfn_to_mfn(d, gpfn); + u32 type = frame_table[mfn].u.inuse.type_info & PGT_type_mask; + + if ( shadow_mode_refcounts(d) && + (type == PGT_writable_page) ) + type = shadow_max_pgtable_type(d, gpfn, NULL); + + if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && + (type == PGT_l1_page_table) ) + return 1; + + if ( shadow_mode_write_all(d) && + type && (type <= PGT_l4_page_table) ) + return 1; + + return 0; +} + static inline l1_pgentry_t gva_to_gpte(unsigned long gva) { l2_pgentry_t gpde; diff --git a/xen/include/asm-x86/x86_32/domain_page.h b/xen/include/asm-x86/x86_32/domain_page.h index f72bce7936..0220de530e 100644 --- a/xen/include/asm-x86/x86_32/domain_page.h +++ b/xen/include/asm-x86/x86_32/domain_page.h @@ -26,4 +26,51 @@ extern void *map_domain_mem(unsigned long pa); */ extern void unmap_domain_mem(void *va); +struct map_dom_mem_cache { + unsigned long pa; + void *va; +}; + +#define MAP_DOM_MEM_CACHE_INIT { .pa = 0 } + +static inline void * +map_domain_mem_with_cache(unsigned long pa, + struct map_dom_mem_cache *cache) +{ + if ( likely(cache != NULL) ) + { + if ( likely(cache->pa) ) + { + if ( likely((pa & PAGE_MASK) == (cache->pa & PAGE_MASK)) ) + goto done; + unmap_domain_mem(cache->va); + } + cache->pa = (pa & PAGE_MASK) | 1; + cache->va = map_domain_mem(cache->pa); + done: + return (void *)(((unsigned long)cache->va & PAGE_MASK) | + (pa & ~PAGE_MASK)); + } + + return map_domain_mem(pa); +} + +static inline void +unmap_domain_mem_with_cache(void *va, + struct map_dom_mem_cache *cache) +{ + if ( unlikely(!cache) ) + unmap_domain_mem(va); +} + +static inline void +unmap_domain_mem_cache(struct map_dom_mem_cache *cache) +{ + if ( likely(cache != NULL) && likely(cache->pa) ) + { + unmap_domain_mem(cache->va); + cache->pa = 0; + } +} + #endif /* __ASM_DOMAIN_PAGE_H__ */ diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h index 313635f82a..91c34ce232 100644 --- a/xen/include/xen/lib.h +++ b/xen/include/xen/lib.h @@ -15,7 +15,7 @@ #define BUG_ON(_p) do { if (_p) BUG(); } while ( 0 ) #ifndef NDEBUG -#define ASSERT(_p) if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s\n", #_p , __LINE__, __FILE__); BUG(); } +#define ASSERT(_p) { if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s\n", #_p , __LINE__, __FILE__); BUG(); } } #else #define ASSERT(_p) ((void)0) #endif diff --git a/xen/include/xen/perfc_defn.h b/xen/include/xen/perfc_defn.h index 501851ec55..c7f2493a1a 100644 --- a/xen/include/xen/perfc_defn.h +++ b/xen/include/xen/perfc_defn.h @@ -86,12 +86,14 @@ PERFCOUNTER_CPU(resync_hl2, "resync HL2 page") PERFCOUNTER_CPU(shadow_make_snapshot, "snapshots created") PERFCOUNTER_CPU(shadow_mark_mfn_out_of_sync_calls, "calls to shadow_mk_out_of_sync") PERFCOUNTER_CPU(shadow_out_of_sync_calls, "calls to shadow_out_of_sync") -PERFCOUNTER_CPU(extra_va_update_sync, "extra syncs for bug in chk_pgtb") PERFCOUNTER_CPU(snapshot_entry_matches_calls, "calls to ss_entry_matches") PERFCOUNTER_CPU(snapshot_entry_matches_true, "ss_entry_matches returns true") PERFCOUNTER_CPU(validate_pte_calls, "calls to validate_pte_change") -PERFCOUNTER_CPU(validate_pte_changes, "validate_pte makes changes") +PERFCOUNTER_CPU(validate_pte_changes1, "validate_pte makes changes1") +PERFCOUNTER_CPU(validate_pte_changes2, "validate_pte makes changes2") +PERFCOUNTER_CPU(validate_pte_changes3, "validate_pte makes changes3") +PERFCOUNTER_CPU(validate_pte_changes4, "validate_pte makes changes4") PERFCOUNTER_CPU(validate_pde_calls, "calls to validate_pde_change") PERFCOUNTER_CPU(validate_pde_changes, "validate_pde makes changes") PERFCOUNTER_CPU(shadow_get_page_fail, "shadow_get_page_from_l1e fails" ) -- 2.30.2